---
title: "Bernoulli or logistic regression?"
author: "Hedibert Freitas Lopes"
date: "2/8/2020"
output: 
  html_document:
    toc: true
    number_sections: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=FALSE)
```

# Data structure
The following data is taken from example 11 from Bolstad 
(2010) Understanding Computational Bayesian Statistics, Wiley Series in Computational Statistics.  The data is on the age and coronary heart disease status for $n=100$ subjects taken from Hosmer and Lemeshow (1989) 
# Applied Logistic Regression, John Wiley and Sons.

```{r fig.width=9, fig.width=6}
#install.packages("aplore3")
library("aplore3")
data(chdage)
n = nrow(chdage)
y = rep(0,100)
y[chdage$chd=="Yes"]=1
x = chdage$age
plot(x,y,xlab="Age",ylab="coronary heart disease status")
```

# Bernoulli model

In this model, the 0/1 or Yes/no data ${\cal D}=\{y_1,\ldots,y_n\}$ are modeled as conditionally independent and identically distributed (iid) Bernoulli(\theta), i.e.
$$
p(y_i|\theta) = \theta^{y_i}(1-\theta)^{1-y_i}, \qquad y_i \in \{0,1\}, \theta \in (0,1).
$$
Therefore, the number of observed 1's, $s_n=\sum_{i=1}^n y_i$, follows,  conditionally on $\theta$, a Binomial distribution, i.e.
$$
p(s_n|\theta) = \frac{n!}{s_n!(n-s_n!)}\theta^{s_n}(1-\theta)^{n-s_n},
$$
for $s_n \in \{0,1,\ldots,n\}$.  The likelihood function is, then,
$$
L(\theta|s_n) \propto \theta^{s_n}(1-\theta)^{n-s_n},
$$
and it is easy to see that the maximum likelihood estimator of $\theta$ is
$$
{\hat \theta}_{mle}=\frac{s_n}{n}.
$$
```{r fig.width=9, fig.width=6}
sn = sum(y)
theta.mle = sn/n
c(sn,theta.mle)
```
## Bayesian inference 
Bayesian inference is complete when a prior distribution for $\theta$ is set up.  Let us use the conjugate prior for $\theta$, a $Beta(a_0,b_0)$, where 
$$
p(\theta) = \frac{\Gamma(a_0+b_0)}{\Gamma(a_0)\Gamma(b_0)}\theta^{a_0-1}(1-\theta)^{b_0-1}, \qquad \theta \in (0,1), a_0,b_0>0,
$$
with prior mean and variance equal to 
$$
E(\theta) = \frac{a_0}{a_0+b_0} \ \ \mbox{and} \ \ 
V(\theta) = \frac{a_0b_0}{(a_0+b_0)^2(a_0+b_0+1)}.
$$


```{r fig.width=9, fig.width=6}
sn = sum(y)
a0 = 4
b0 = 16
a1 = a0+sn
b1 = b0+n-sn

# Posterior moments and density
mean      = a0/(a0+b0)
stdev     = sqrt(a0*b0/((a0+b0)^2*(a0+b0+1)))
prior.mom = c(mean,stdev,qbeta(c(0.05,0.5,0.95),a0,b0))
mean      = a1/(a1+b1)
stdev     = sqrt(a1*b1/((a1+b1)^2*(a1+b1+1)))
qbeta.m1  = qbeta(c(0.05,0.5,0.95),a1,b1)
post.mom  = c(mean,stdev,qbeta.m1)

table = round(rbind(prior.mom,post.mom),3)
rownames(table) = c("Prior summaries","Posterior summaries")
colnames(table) = c("Mean","St.Dev.","5%","50%","95%")
table
```

```{r fig.width=9, fig.width=6}
thetas = seq(0,1,length=1000)
plot(thetas,dbeta(thetas,a1,b1),xlab=expression(theta),ylab="Density",type="l",lwd=2)
lines(thetas,dbeta(thetas,a0,b0),col=2,lwd=2)
points(sn/n,0,pch=16,col=3,cex=2)
legend("topright",legend=c("Data","Prior","Posterior"),col=3:1,lty=1,lwd=2)
```

## Prior predictive of $s_n=\sum_{i=1}^n y_i$ given $(n,a_0,b_0)$

```{r fig.width=9, fig.width=6}
sn1   = 0:n
term1 = gamma(a0+b0)/(gamma(a0)*gamma(b0))
term2 = gamma(a0+sn1)*gamma(b0+n-sn1)/gamma(a0+b0+n)
term3 = gamma(n+1)/(gamma(sn1+1)*gamma(n-sn1+1))
prob.sn1 = term1*term2*term3
plot(sn1,prob.sn1,xlab=expression(s[n]),ylab="Prior predictive")
points(sn,0,pch=16,col=2)
legend("topright",legend=paste("Upper tail prob = ",round(sum(prob.sn1[sn1>=sn]),4),sep=""))
```

## Posterior predictive of $y_{n+1}$ given $(a_1,b_1)$
It can be shown that
$$
Pr(y_{n+1}=1|s_n,a_0,b_0) = \frac{a_0+s_n}{a_0+b_0+n} = \alpha+\beta {\bar y}_n,
$$
where $\alpha=\alpha_0/(a_0+b_0+n)$ and $\beta=n/(a_0+b_0+n)$ and 
${\bar y}_n = \frac{s_n}{n}$.

```{r fig.width=9, fig.width=6}
prob.next = (a0+sn1)/(a0+b0+n) 
legend = paste("a0=",a0,",","b0=",b0,sep="")
plot(sn1/n,prob.next,type="l",ylim=c(0,1),
     xlab="Sample proportion",ylab="Posterior predictive of one",lwd=2)
a0=0;b0=0;prob.next=(a0+sn1)/(a0+b0+n) 
legend=c(legend,paste("a0=",a0,",","b0=",b0,sep=""))
lines(sn1/n,prob.next,col=2,lwd=2)
a0=30;b0=10;prob.next=(a0+sn1)/(a0+b0+n)
legend=c(legend,paste("a0=",a0,",","b0=",b0,sep=""))
lines(sn1/n,prob.next,col=3,lwd=2)
legend("topleft",legend=legend,col=1:3,lwd=2)
```


# Logit and probit regression models
In the simplest forms, the observation $y_i$ (coronary heart disease status) is related to the covariate $x_i$ (age) via either a logit or probit link function:
\begin{eqnarray*}
\theta_i &=& \frac{1}{1+e^{-(\beta_0+\beta_1 x_i)}}\\
\theta_i &=& \Phi(\beta_0+\beta_1 x_i),
\end{eqnarray*}
respectively.

```{r fig.width=9, fig.width=6}
plot(x+rnorm(n,0,0.2),y,xlab="Age",ylab="coronary heart disease status")
```

## Likelihood function of $(\beta_1,\beta_2)$
Given the data ${\cal D}_n=\{(x_1,y_1),\ldots,(x_n,y_n)_\}$, the likelihood function of $(\beta_0,\beta_1)$ is given by
$$
L(\beta_0,\beta_1|{\cal D}_n) = \prod_{i=1}^n \theta_i^{y_i} (1-\theta_i)^{1-y_i}.
$$
```{r}
loglikelihood.logit = function(beta){
  eta   = beta[1]+beta[2]*x
  theta = exp(eta)/(1+exp(eta))
  return(sum(y*log(theta)+(1-y)*log(1-theta)))
}
loglikelihood.probit = function(beta){
  theta = pnorm(beta[1]+beta[2]*x)
  return(sum(y*log(theta)+(1-y)*log(1-theta)))
}
minusloglike.logit  = function(beta){-loglikelihood.logit(beta)}
minusloglike.probit = function(beta){-loglikelihood.probit(beta)}

beta.mle.logit = round(nlm(minusloglike.logit,c(-5,0.1))$estimate,4)
beta.mle.probit = round(nlm(minusloglike.probit,c(-3,0.07))$estimate,4)


print(paste("betas MLE (logit) : (",beta.mle.logit[1],",",beta.mle.logit[2],")",sep=""))
print(paste("betas MLE (probit) : (",beta.mle.probit[1],",",beta.mle.probit[2],")",sep=""))
```

```{r fig.width=9, fig.width=6}
N      = 100
beta0s = seq(-10,-1.5,length=N)
beta1s = seq(0.01,0.2,length=N)
loglike.logit  = matrix(0,N,N)
loglike.probit = matrix(0,N,N)
for (i in 1:N)
  for (j in 1:N){
    loglike.logit[i,j] = loglikelihood.logit(c(beta0s[i],beta1s[j]))
    loglike.probit[i,j] = loglikelihood.probit(c(beta0s[i],beta1s[j]))
}
par(mfrow=c(1,2))
contour(beta0s,beta1s,exp(loglike.logit),drawlabels=FALSE,
        xlab=expression(beta[0]),ylab=expression(beta[1]),
        main="Logit model")
points(beta.mle.logit[1],beta.mle.logit[2],col=2,pch=16)
contour(beta0s,beta1s,exp(loglike.probit),drawlabels=FALSE,
        xlab=expression(beta[0]),ylab=expression(beta[1]),
        main="Probit model")
points(beta.mle.probit[1],beta.mle.probit[2],col=2,pch=16)
```

## Fitted logit and probit regressions
```{r fig.width=9, fig.width=6}
xx = 0:100
eta.logit   = beta.mle.logit[1]+beta.mle.logit[2]*xx
eta.probit  = beta.mle.probit[1]+beta.mle.probit[2]*xx
prob.logit  = exp(eta.logit)/(1+exp(eta.logit))
prob.probit = pnorm(eta.probit)

plot(x,y,xlab="Age",ylab="coronary heart disease status")
lines(xx,prob.logit,col=2,lwd=2)
lines(xx,prob.probit,col=3,lwd=2)
```

## Bayesian inference via SIR
Below we assume a simple bivariate uniform distribution for $(\beta_0,\beta_1)$ in the rectangle ${\cal A}=\{(-10,-1.5) \times (0.01,0.2)\}$ and use sampling importance resampling (SIR) to convert prior draws into resampled posterior draws.
```{r fig.width=9, fig.width=6}
set.seed(2325)
M0 = 20000
M  = 10000
draws.beta0 = runif(M0,-10,-1.5)
draws.beta1 = runif(M0,0.01,0.2)
w.logit     = rep(0,M0)
w.probit    = rep(0,M0)
for (i in 1:M0){
  w.logit[i]  = loglikelihood.logit(c(draws.beta0[i],draws.beta1[i]))
  w.probit[i] = loglikelihood.probit(c(draws.beta0[i],draws.beta1[i]))
}
min = min(w.probit[!is.na(w.probit)])
w.probit[is.na(w.probit)]=min
prob.m2.logit  = mean(exp(w.logit))
prob.m2.probit = mean(exp(w.probit))
w.logit    = exp(w.logit-max(w.logit))
w.probit   = exp(w.probit-max(w.probit))
ind.logit  = sample(1:M0,size=M,replace=TRUE,prob=w.logit)
ind.probit = sample(1:M0,size=M,replace=TRUE,prob=w.probit)
beta0.logit = draws.beta0[ind.logit] 
beta1.logit = draws.beta1[ind.logit]
beta0.probit = draws.beta0[ind.probit] 
beta1.probit = draws.beta1[ind.probit]

par(mfrow=c(1,2))
plot(beta0.logit,beta1.logit,xlab=expression(beta[0]),ylab=expression(beta[1]))
points(beta.mle.logit[1],beta.mle.logit[2],col=2,pch=16)
plot(beta0.probit,beta1.probit,xlab=expression(beta[0]),ylab=expression(beta[1]))
points(beta.mle.probit[1],beta.mle.probit[2],col=2,pch=16)


par(mfrow=c(2,2))
hist(beta0.logit,prob=TRUE,xlab="",main=expression(beta[0]))
abline(v=beta.mle.logit[1],col=2,lwd=2)
hist(beta1.logit,prob=TRUE,xlab="",main=expression(beta[1]))
abline(v=beta.mle.logit[2],col=2,lwd=2)
hist(beta0.probit,prob=TRUE,xlab="",main=expression(beta[0]))
abline(v=beta.mle.probit[1],col=2,lwd=2)
hist(beta1.probit,prob=TRUE,xlab="",main=expression(beta[1]))
abline(v=beta.mle.probit[2],col=2,lwd=2)
```

# Model comparison

## Model comparison via posterior predictive summaries
```{r fig.width=9, fig.width=6}
xx = 0:100
N = length(xx)
prob.bayes.logit = matrix(0,M,N)
prob.bayes.probit = matrix(0,M,N)
for (i in 1:M){
  prob.bayes.logit[i,] = 1/(1+exp(-beta0.logit[i]-beta1.logit[i]*xx))
  prob.bayes.probit[i,] = pnorm(beta0.probit[i]+beta1.probit[i]*xx)
}
prob.bayes.logit = t(apply(prob.bayes.logit,2,quantile,c(0.05,0.5,0.95)))
prob.bayes.probit = t(apply(prob.bayes.probit,2,quantile,c(0.05,0.5,0.95)))

par(mfrow=c(1,1))
plot(x,y,xlim=range(xx),xlab="Age",ylab="coronary heart disease status")
lines(xx,prob.bayes.logit[,2],col=2,lwd=2)
lines(xx,prob.bayes.logit[,1],col=2,lwd=2,lty=2)
lines(xx,prob.bayes.logit[,3],col=2,lwd=2,lty=2)
lines(xx,prob.bayes.probit[,2],col=3,lwd=2)
lines(xx,prob.bayes.probit[,1],col=3,lwd=2,lty=2)
lines(xx,prob.bayes.probit[,3],col=3,lwd=2,lty=2)
abline(h=qbeta.m1[1],col=6,lty=2)
abline(h=qbeta.m1[2],col=6)
abline(h=qbeta.m1[3],col=6,lty=2)
legend("bottomright",legend=c("iid","logit","probit"),col=c(6,2,3),lty=1,lwd=2)
#legend("topleft",legend=c("OLS","GLS","BAYES","Bernoulli"),col=c(2,4,3,6),lty=1,lwd=2)
```

## Computing Bayes factors and posterior model probabilities

```{r}
term1   = gamma(a0+b0)/(gamma(a0)*gamma(b0))
term2   = gamma(a0+sn)*gamma(b0+n-sn)/gamma(a0+b0+n)
term3   = gamma(n+1)/(gamma(sn+1)*gamma(n-sn+1))
prob.m1      = term1*term2*term3/(gamma(n+1)/gamma(sn+1)/gamma(n-sn+1))
BF12         = prob.m1/prob.m2.logit
BF32         = prob.m2.probit/prob.m2.logit
PMP.m1       = prob.m1/(prob.m1+prob.m2.logit+prob.m2.probit)
PMP.m2.logit = prob.m2.logit/(prob.m1+prob.m2.logit+prob.m2.probit)
PMP.m2.probit= 1-PMP.m1-PMP.m2.logit
print(paste("Prior predictive M1 = ",prob.m1,sep=""))
print(paste("Prior predictive M2 logit = ",prob.m2.logit,sep=""))
print(paste("Prior predictive M2 probit = ",prob.m2.probit,sep=""))
print(paste("Bayes factor M1 vs M2 logit = ",round(BF12,3),sep=""))
print(paste("Bayes factor M2 probit vs M2 logit = ",round(BF32,3),sep=""))
print(paste("Posterior model probability for M1 = ",round(PMP.m1,3),sep=""))
print(paste("Posterior model probability for M2 logit = ",round(PMP.m2.logit,3),sep=""))
print(paste("Posterior model probability for M1 probit = ",round(PMP.m2.probit,3),sep=""))
```